# Data generation file for:
# Carnegie, Clark, and Zucker, "Global Governance under Populism"
# World Politics

# Software: R 4.3.2 / RStudio 2024.04.1+748 / macOS Sonoma 14.5
# Hardware: 2019 MacBook Pro, 2.4 GHz 8-Core Intel Core i9, 64 GB

# Loading packages
pacman::p_load(tidyverse,
               data.table,
               janitor,
               magrittr,
               readxl)

# ------------------ #
# -> DATA LOADING ####
# ------------------ #

# Loading data on WDI scientific indicators with source information
wdi_science <- read_rds("wdi_science.rds")

# Loading excluded (derivative / duplicate) variable names
excluded_vars <- read_rds("excluded_vars.rds")

# Load the full World Development Indicators (WDI) dataset from a CSV file,
# clean column names, and reshape the data to long format
wdi_full <- read_csv("wdi_data_full.csv") %>%
  clean_names() %>%
  pivot_longer(cols = starts_with("x"),
               names_to = "year", 
               values_to = "value")

# -- Convert the data frame to a data.table for efficient data manipulation
setDT(wdi_full)

# -- Exclude derivative variables
wdi_full <- wdi_full[!indicator_name %in% excluded_vars$Variable]

# -- Convert the 'year' column from character to numeric by removing the "x" prefix
wdi_full <- wdi_full[, year := as.numeric(gsub("x", "", year))]

# -- Remove rows where the year is 64
wdi_full <- wdi_full[year != 64]

# -- Create logical columns that categorize WDI indicators
wdi_full <- wdi_full[, science := indicator_name %in% wdi_science$variable]
wdi_full <- wdi_full[, science_raw := indicator_name %in% wdi_science$variable[wdi_science$raw_state == 1]]
wdi_full <- wdi_full[, science_imp := indicator_name %in% wdi_science$variable[wdi_science$raw_state == 0]]
wdi_full <- wdi_full[, nonscience := (!indicator_name %in% wdi_science$variable)]

# Create science-only WDI dataset
wdi_science_country_year <- wdi_full[science == 1]

# -- Distinguishing by raw vs. imputed
wdi_science_country_year_raw <- wdi_full[science_raw == 1]
wdi_science_country_year_imp <- wdi_full[science_imp == 1]

# Create non-science WDI dataset
wdi_nonscience_country_year <- wdi_full[nonscience == 1]

# Create copy of full dataset
wdi_all <- copy(wdi_full)

# Creating country-year scientific missingness dataset
wdi_full_by_var <- wdi_full[, .(VAR_NOT_NA = mean(!is.na(value))),
                            by = list(indicator_name, year)]

wdi_science_country_year <- merge.data.table(x = wdi_science_country_year,
                                             y = wdi_full_by_var,
                                             by = c("year", "indicator_name"),
                                             all.x = TRUE)

wdi_science_country_year2 <- wdi_science_country_year[, .(SCIENCE_NA = mean(is.na(value)),
                                                          SCIENCE_NA_WEIGHT = weighted.mean(is.na(value), w = VAR_NOT_NA),
                                                          SCIENCE_NA_RAW = mean(is.na(value[science_raw == TRUE])),
                                                          SCIENCE_NA_RAW_WEIGHT = weighted.mean(is.na(value[science_raw == TRUE]), w = VAR_NOT_NA[science_raw == TRUE]),
                                                          SCIENCE_NA_IMP = mean(is.na(value[science_imp == TRUE])),
                                                          SCIENCE_NA_IMP_WEIGHT = weighted.mean(is.na(value[science_imp == TRUE]), w = VAR_NOT_NA[science_imp == TRUE])), 
                                                      by = list(country_code, year)]
wdi_science_country_year2 <- wdi_science_country_year2[year != 64]

# Creating country-year non-scientific missingness dataset
wdi_nonscience_country_year <- merge.data.table(x = wdi_nonscience_country_year,
                                                y = wdi_full_by_var,
                                                by = c("year", "indicator_name"),
                                                all.x = TRUE)

wdi_nonscience_country_year2 <- wdi_nonscience_country_year[, .(NONSCIENCE_NA = mean(is.na(value))), 
                                                            by = list(country_code, year)]
wdi_nonscience_country_year2 <- wdi_nonscience_country_year2[year != 64]

WDI_MISS <- merge.data.table(x = wdi_science_country_year2,
                             y = wdi_nonscience_country_year2,
                             by = c("country_code", "year")) %>%
  filter(year %in% 1990:2018) # time frame of analyses

# Loading Polity data and lagging by one year
polity <- read_excel("p5v2018.xls") %>%
  mutate(scode = countrycode::countrycode(country, "country.name", "wb")) %>%
  select(scode, year, polity2)

polity_l1 <- polity %>% 
  mutate(year = year + 1) %>%
  distinct(scode, year, .keep_all = TRUE)

WDI_MISS <- WDI_MISS %>%
  left_join(polity_l1, by = c("country_code" = "scode", "year"))

# Loading covariate: DPI (government ideology)
# -- right-1, left-3, center-2, no information-0, no executive-NA
dpi <- read_csv("DPI2015.csv") %>%
  mutate(countryname = case_when(countryname == "Cent. Af. Rep." ~ "Central African Republic",
                                 countryname == "Dom. Rep." ~ "Dominican Republic",
                                 countryname == "GDR" ~ "East Germany",
                                 countryname == "PRC" ~ "China",
                                 countryname == "PRK" ~ "North Korea",
                                 countryname == "ROK" ~ "South Korea",
                                 countryname == "S. Africa" ~ "South Africa",
                                 TRUE ~ as.character(countryname))) %>%
  mutate(country_code = countrycode::countrycode(countryname, "country.name", "wb")) %>%
  select(country_code, year, execrlc)

dpi <- dpi %>%
  mutate(execrlc2 = ifelse(execrlc %in% c(-999), NA, execrlc)) %>%
  mutate(rightwing = as.integer(execrlc2 == 1))

dpi_l1 <- dpi %>% 
  mutate(year = year + 1)  %>%
  distinct(country_code, year, .keep_all = TRUE)

WDI_MISS <- WDI_MISS %>%
  left_join(dpi_l1, by = c("country_code", "year"))

# Loading covariate: GDP per capita
gdp_per_capita <- wdi_full[indicator_name == "GDP per capita (constant 2010 US$)"] %>%
  select(country_code, year, value) %>%
  rename(gdp_per_capita = value) %>%
  distinct(country_code, year, .keep_all = TRUE)
gdp_per_capita_l1 <- gdp_per_capita %>% mutate(year = year + 1)

WDI_MISS <- WDI_MISS %>%
  left_join(gdp_per_capita_l1, by = c("country_code", "year"))

# Loading covariate: IMF program participation
imf <- read_csv("IMFagg.csv") %>%
  mutate(country_code = countrycode::countrycode(ccode, origin = "iso3c", destination = "wb")) %>%
  distinct(country_code, year, .keep_all = TRUE)
imf_l1 <- imf %>% mutate(year = year + 1)

WDI_MISS <- WDI_MISS %>%
  left_join(imf_l1, by = c("country_code", "year")) %>%
  mutate(IMF = replace_na(IMF, 0))

# Loading covariate: Fossil fuel (% energy consumption)
fossil_fuels <- wdi_full[indicator_name == "Fossil fuel energy consumption (% of total)"] %>%
  select(country_code, year, value) %>%
  rename(fossil_fuels = value) %>%
  distinct(country_code, year, .keep_all = TRUE)
fossil_fuels_l1 <- fossil_fuels %>% mutate(year = year + 1)

WDI_MISS <- WDI_MISS %>%
  left_join(fossil_fuels_l1, by = c("country_code", "year"))

# Loading covariate: Value added by agriculture, forestry, and fishing (% GDP)
agriculture_va <- wdi_full[indicator_name == "Agriculture, forestry, and fishing, value added (% of GDP)"] %>%
  select(country_code, year, value) %>%
  rename(agriculture_va = value) %>%
  distinct(country_code, year, .keep_all = TRUE)
agriculture_va_l1 <- agriculture_va %>% mutate(year = year + 1)

WDI_MISS <- WDI_MISS %>%
  left_join(agriculture_va_l1, by = c("country_code", "year"))

# Loading covariate: Net ODA and official assistance received
net_oda <- wdi_full[indicator_name == "Net official development assistance and official aid received (current US$)"] %>%
  select(country_code, year, value) %>%
  rename(net_oda = value) %>%
  distinct(country_code, year, .keep_all = TRUE)
net_oda_l1 <- net_oda %>% mutate(year = year + 1)

WDI_MISS <- WDI_MISS %>%
  left_join(net_oda_l1, by = c("country_code", "year"))

# Load populism data from Funke et al. (AER)
funke <- read_csv("funke_data.csv") %>%
  mutate(ctry = countrycode::countrycode(ctry, "country.name", "wb"))

funke_l1 <- funke %>%
  mutate(year = year + 1) %>%
  rename(right_aer_l1 = right,
         fpopulism_aer_l1 = fpopulism)
funke_l2 <- funke_l1 %>%
  mutate(year = year + 1) %>%
  rename(right_aer_l2 = right_aer_l1,
         fpopulism_aer_l2 = fpopulism_aer_l1)
funke_l3 <- funke_l2 %>%
  mutate(year = year + 1) %>%
  rename(right_aer_l3 = right_aer_l2,
         fpopulism_aer_l3 = fpopulism_aer_l2)

WDI_MISS <- WDI_MISS %>%
  left_join(funke_l1, by = c("country_code" = "ctry", "year")) %>%
  left_join(funke_l2, by = c("country_code" = "ctry", "year")) %>%
  left_join(funke_l3, by = c("country_code" = "ctry", "year")) %>%
  mutate_at(vars(fpopulism_aer_l1,
                 fpopulism_aer_l2,
                 fpopulism_aer_l3), ~ replace_na(., 0))

# Load populism data from the Blair Institute (https://institute.global/policy/populists-power-around-world)
blair <- read.csv("blair_data.csv") %>%
  mutate(country_code = countrycode::countrycode(country, "country.name", "wb"))

# -- Summarize populism data by country and year
blair <- blair %>%
  group_by(country_code, year) %>%
  summarize(populism = mean(populism, na.rm = TRUE)) %>%
  ungroup()

# -- Create a one-year lag version of the dataset
blair_l1 <- blair %>%
  mutate(year = year + 1) %>%
  # Ensure that there are no duplicate rows for the same country-year
  distinct(country_code, year, .keep_all = TRUE)

WDI_MISS <- WDI_MISS %>%
  left_join(blair_l1, by = c("country_code", "year")) %>%
  mutate(populism = replace_na(populism, 0))

# Scaling variables
SCIENCE_NA_mean <- mean(WDI_MISS$SCIENCE_NA, na.rm = TRUE)
SCIENCE_NA_sd <- sd(WDI_MISS$SCIENCE_NA, na.rm = TRUE)

NONSCIENCE_NA_mean <- mean(WDI_MISS$NONSCIENCE_NA, na.rm = TRUE)
NONSCIENCE_NA_sd <- sd(WDI_MISS$NONSCIENCE_NA, na.rm = TRUE)

SCIENCE_NA_RAW_mean <- mean(WDI_MISS$SCIENCE_NA_RAW, na.rm = TRUE)
SCIENCE_NA_RAW_sd <- sd(WDI_MISS$SCIENCE_NA_RAW, na.rm = TRUE)

SCIENCE_NA_IMP_mean <- mean(WDI_MISS$SCIENCE_NA_IMP, na.rm = TRUE)
SCIENCE_NA_IMP_sd <- sd(WDI_MISS$SCIENCE_NA_IMP, na.rm = TRUE)

SCIENCE_NA_WEIGHT_mean <- mean(WDI_MISS$SCIENCE_NA_WEIGHT, na.rm = TRUE)
SCIENCE_NA_WEIGHT_sd <- sd(WDI_MISS$SCIENCE_NA_WEIGHT, na.rm = TRUE)

SCIENCE_NA_RAW_WEIGHT_mean <- mean(WDI_MISS$SCIENCE_NA_RAW_WEIGHT, na.rm = TRUE)
SCIENCE_NA_RAW_WEIGHT_sd <- sd(WDI_MISS$SCIENCE_NA_RAW_WEIGHT, na.rm = TRUE)

SCIENCE_NA_IMP_WEIGHT_mean <- mean(WDI_MISS$SCIENCE_NA_IMP_WEIGHT, na.rm = TRUE)
SCIENCE_NA_IMP_WEIGHT_sd <- sd(WDI_MISS$SCIENCE_NA_IMP_WEIGHT, na.rm = TRUE)

WDI_MISS %<>%
  mutate(SCIENCE_NA_sd = (SCIENCE_NA - SCIENCE_NA_mean)/SCIENCE_NA_sd) %>%
  mutate(NONSCIENCE_NA_sd = (NONSCIENCE_NA - NONSCIENCE_NA_mean)/NONSCIENCE_NA_sd) %>%
  mutate(SCIENCE_NA_RAW_sd = (SCIENCE_NA_RAW - SCIENCE_NA_RAW_mean)/SCIENCE_NA_RAW_sd) %>%
  mutate(SCIENCE_NA_IMP_sd = (SCIENCE_NA_IMP - SCIENCE_NA_IMP_mean)/SCIENCE_NA_IMP_sd) %>%
  mutate(SCIENCE_NA_WEIGHT_sd = (SCIENCE_NA_WEIGHT - SCIENCE_NA_WEIGHT_mean)/SCIENCE_NA_WEIGHT_sd) %>%
  mutate(SCIENCE_NA_RAW_WEIGHT_sd = (SCIENCE_NA_RAW_WEIGHT - SCIENCE_NA_RAW_WEIGHT_mean)/SCIENCE_NA_RAW_WEIGHT_sd) %>%
  mutate(SCIENCE_NA_IMP_WEIGHT_sd = (SCIENCE_NA_IMP_WEIGHT - SCIENCE_NA_IMP_WEIGHT_mean)/SCIENCE_NA_IMP_WEIGHT_sd) %>%
  mutate(gdp_per_capita_ln = log1p(gdp_per_capita)) %>%
  mutate(execrlc2 = ifelse(execrlc %in% c(-999), NA, execrlc)) %>% #in DPI data, -999 = NA
  mutate(rightwing = as.integer(execrlc2 == 1))

# Populism time in office
pop_spell_breaks <- funke %>%
  mutate(continuous = ifelse(year == dplyr::lag(year, n = 1) + 1, 1, 0)) %>%
  mutate(continuous = ifelse(is.na(continuous), 0, continuous)) %>%
  filter(continuous == 0) %>%
  group_by(ctry) %>%
  mutate(spell = row_number()) %>%
  ungroup() %>%
  dplyr::select(ctry, year, spell)

pop_time_in_office <- funke %>%
  left_join(pop_spell_breaks, by = c("ctry", "year")) %>%
  mutate(spell = zoo::na.locf(spell)) %>%
  group_by(ctry, spell) %>%
  mutate(populism_tenure = cumsum(fpopulism)) %>%
  ungroup() %>%
  mutate(year = year + 1) %>%
  mutate(populism_tenure = replace_na(populism_tenure, 0)) %>%
  dplyr::select(ctry, year, populism_tenure)

WDI_MISS %<>%
  left_join(pop_time_in_office, by = c("country_code" = "ctry", "year"))

# Three-year lag for covariates

WDI_MISS_l3 <- WDI_MISS %>%
  mutate(year = year + 2) %>% # lagging by two, as these variables are already lagged by one (2+1=3)
  dplyr::select(polity2, rightwing,
                gdp_per_capita_ln, IMF,
                country_code, year)

WDI_MISS %<>%
  left_join(WDI_MISS_l3, by = c("country_code", "year"), suffix = c("", "_l3"))

# Adding new covariates for appendix tests
growth_data <- wdi_full %>%
  filter(indicator_name == "GDP growth (annual %)") %>%
  dplyr::select(country_code, year, value) %>%
  rename(gdp_growth_annual_pct = value) %>%
  mutate(year = year + 1)

unemployment_data <- wdi_full %>%
  filter(indicator_name == "Unemployment, total (% of total labor force) (modeled ILO estimate)") %>%
  dplyr::select(country_code, year, value) %>%
  rename(unemployment_ilo_estimate = value) %>%
  mutate(year = year + 1)

crisis_data <- read_excel("global_crisis_data.xlsx") %>%
  clean_names() %>%
  dplyr::select(cc3, year, banking_crisis, systemic_crisis, currency_crises, inflation_crises) %>%
  slice(-1) %>%
  mutate_at(vars(banking_crisis, systemic_crisis, currency_crises, inflation_crises), as.numeric) %>%
  rowwise() %>%
  mutate(crisis = banking_crisis + systemic_crisis + currency_crises + inflation_crises) %>%
  mutate(country_code = countrycode::countrycode(cc3, "iso3c", "wb")) %>%
  dplyr::select(country_code, year, crisis) %>%
  mutate(year = year + 1)

V_Dem_CY_Full_Others_v11_1 <- read_csv("V-Dem-CY-Full-Others-v11.1.csv")
vdem <- data.frame(V_Dem_CY_Full_Others_v11_1)
vdem <- data.frame(cbind(COWcode=vdem$COWcode, year=vdem$year, legitideol=vdem$v2exl_legitideol, natbin=vdem$v2exl_legitideolcr_0))
vdem$country_code <- countrycode::countrycode(vdem$COWcode, origin = "cown", destination = "wb")
vdem <- vdem[-c(1)]
vdem$year <- vdem$year + 1
vdem$natcon <- vdem$natbin*vdem$legitideol # strength of nationalism

wbcond <- read.csv("dpadproj.csv")
wbcond <- wbcond[c(1, 7, 20)]
wbcond$ccode <- countrycode::countrycode(wbcond$Country, origin = "country.name", destination = "wb")
wbcond$X..Prior.Actns <- as.numeric(wbcond$X..Prior.Actns)
wbcond <- aggregate(wbcond$X..Prior.Actns, list(wbcond$ccode, wbcond$FY.Q), FUN = sum)
colnames(wbcond) <- c("country_code", "year", "conditions")
wbcond$year <- wbcond$year + 1

WDI_MISS %<>%
  left_join(growth_data, by = c("country_code", "year")) %>%
  left_join(unemployment_data, by = c("country_code", "year")) %>%
  left_join(crisis_data, by = c("country_code", "year")) %>%
  left_join(vdem, by = c("country_code", "year")) %>%
  left_join(wbcond, by = c("country_code", "year")) %>%
  mutate(conditions = ifelse(is.na(conditions), 0, conditions))

# Writing WDI missingness analysis dataset, after removing an extraneous variables
WDI_MISS %<>%
  select(-c("ccode", "natbin", "legitideol"))

write_csv(WDI_MISS, "wdi_miss.csv")

# Loading GHG emissions data
# -- Third-party estimated
wdig <- read_csv("wdighgtot.csv") %>%
  select(-c(2, 3)) %>%
  rename(year = 1, country_code = 2, ghgest = 3) %>%
  mutate(across(c(year, ghgest), as.numeric))

# -- State-reported
unfg <- read.csv("unfccctot.csv", stringsAsFactors = FALSE)
unfg <- unfg[-c(2,32)]
colnames(unfg) <- c("ctry", 1990:2018)
for (i in 2:30) {
  for (j in 1:52) {
    unfg[j,i] <- as.numeric(gsub(",","",unfg[j,i]))
  }
}
unfg <- reshape(unfg, 
                direction = "long",
                varying = list(names(unfg)[2:30]),
                v.names = "ghg",
                idvar = c("ctry"),
                timevar = "year",
                times = 1990:2018)
unfg$ghg <- as.numeric(unfg$ghg)
unfg$country_code <- countrycode::countrycode(unfg$ctry, origin = "country.name", destination = "iso3c")
unfg <- unfg[-c(1)]
unfg$annex <- 1

unfccc_nonannex_list <- list.files("ghgunfccc", pattern = "*csv", full.names = TRUE)
unfccc_nonannex <- lapply(unfccc_nonannex_list, read_csv)
unfccc_nonannex2 <- lapply(seq_along(unfccc_nonannex),
                           function(i) {
                             colnames(unfccc_nonannex[[i]]) <- slice(unfccc_nonannex[[i]], 3)
                             colnames(unfccc_nonannex[[i]]) <- gsub(pattern = "Last Reported Year \\(", replacement = "", colnames(unfccc_nonannex[[i]]))
                             colnames(unfccc_nonannex[[i]]) <- gsub(pattern = "Last Inventory Year \\(", replacement = "", colnames(unfccc_nonannex[[i]]))
                             colnames(unfccc_nonannex[[i]]) <- gsub(pattern = "\\)", replacement = "", colnames(unfccc_nonannex[[i]]))
                             unfccc_nonannex[[i]]$Country <- gsub(pattern = "Query results for  — Party: ",
                                                                  replacement = "",
                                                                  unfccc_nonannex[[i]]$Category[[1]])
                             unfccc_nonannex[[i]]$Country <- gsub(pattern = " — Years: All years — Category: Total GHG emissions including LULUCF/LUCF — Gas: Aggregate GHGs — Unit: Tg CO₂ equivalent",
                                                                  replacement = "",
                                                                  unfccc_nonannex[[i]]$Country)
                             unfccc_nonannex[[i]] <- slice(unfccc_nonannex[[i]], 4)
                             unfccc_nonannex[[i]] %<>% 
                               dplyr::select(-Category) %>%
                               mutate_all(~ ifelse(. == "—", NA, .))
                             unfccc_nonannex[[i]]
                           })
unfccc_nonannex3 <- rbindlist(unfccc_nonannex2,
                              fill = TRUE) %>%
  dplyr::select(Country, everything()) %>%
  pivot_longer(data = .,
               cols = `1990`:`2018`,
               names_to = "Year",
               values_to = "ghg")
unfccc_nonannex3$country_code <- countrycode::countrycode(unfccc_nonannex3$Country, 
                                                          origin = "country.name", 
                                                          destination = "iso3c")
unfccc_nonannex3 %<>% dplyr::select(-Country) %>%
  rename(year = Year) %>%
  mutate(year = as.numeric(year)) %>%
  mutate(ghg = as.numeric(ghg))

unfccc_nonannex3 %<>% mutate(ghg = ghg*1000)

unfccc_all <- bind_rows(unfccc_nonannex3,
                        unfg)

# Loading controls for GHG gap tests
dqcovs <- read_csv("dqcontrols.csv")
dqcovs <- dqcovs[-c(2:3)]
colnames(dqcovs) <- c("year", "country_code", "ff.energy.consumption", "ag.va", "coal.rents", "min.rents", "nat.gas.rents",
                      "oil.rents", "forest.rents")
dqcovs[dqcovs == ".."] <- NA
dqcovs$ff.energy.consumption <- as.numeric(dqcovs$ff.energy.consumption)
dqcovs$ag.va <- as.numeric(dqcovs$ag.va)
dqcovs$coal.rents <- as.numeric(dqcovs$coal.rents)
dqcovs$min.rents <- as.numeric(dqcovs$min.rents)
dqcovs$nat.gas.rents <- as.numeric(dqcovs$nat.gas.rents)
dqcovs$oil.rents <- as.numeric(dqcovs$oil.rents)
dqcovs$forest.rents <- as.numeric(dqcovs$forest.rents)

# Merge alternate emissions data
ghgb <- left_join(wdig, unfccc_all, by = c("country_code", "year"))

# Calculate gap in reported emissions
ghgb <- ghgb %>%
  mutate(
    diff = log1p(abs(ghgest - ghg)),
    annex = ifelse(is.na(annex), 0, annex)
  )

# Merging into main WDI dataset
GHG_GAP <- merge(WDI_MISS, ghgb, by = c("country_code", "year"), all.x = TRUE)

GHG_GAP %<>% mutate(execrlc2 = case_when(execrlc == -999 | execrlc == 0 ~ NA_real_,
                                         execrlc == 1 ~ 1,
                                         execrlc == 2 ~ 2,
                                         execrlc == 3 ~ 3))

dqcovs$year <- as.numeric(dqcovs$year)
GHG_GAP <- left_join(GHG_GAP, dqcovs, by = c("country_code", "year"))
GHG_GAPsub <- GHG_GAP[GHG_GAP$annex == 1,]
GHG_GAPsub2 <- GHG_GAP[GHG_GAP$annex == 0,]

# Writing GHG gap data
write_csv(GHG_GAPsub, "ghg_gap_annex1.csv")

# Loading and cleaning archived WDI files (2005-2018)
w05 <- read_delim("WDI Archives/2005.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2004,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2005) %>%
  mutate(value = as.numeric(value))

w06 <- read_delim("WDI Archives/2006.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2005,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2006) %>%
  mutate(value = as.numeric(value))

w07 <- read_delim("WDI Archives/2007.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2006,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2007) %>%
  mutate(value = as.numeric(value))

w08 <- read_delim("WDI Archives/2008.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2007,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2008) %>%
  mutate(value = as.numeric(value))

w09 <- read_delim("WDI Archives/2009.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2008,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2009) %>%
  mutate(value = as.numeric(value))

w10 <- read_delim("WDI Archives/2010.txt") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2009,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2010) %>%
  mutate(value = as.numeric(value))

w11 <- read_excel("WDI Archives/2011.xlsx", 
                  sheet = "WDI_GDF_Data") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2010,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2011) %>%
  mutate(value = as.numeric(value))

w12 <- read_excel("WDI Archives/2012.xlsx", 
                  sheet = "WDI_GDF_Data") %>%
  clean_names() %>%
  pivot_longer(cols = x1960:x2011,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2012) %>%
  mutate(value = as.numeric(value))

w13 <- read_excel("WDI Archives/2013.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2012,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2013) %>%
  mutate(value = as.numeric(value))

w14 <- read_excel("WDI Archives/2014.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2013,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2014) %>%
  mutate(value = as.numeric(value))

w15 <- read_excel("WDI Archives/2015.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2014,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2015) %>%
  mutate(value = as.numeric(value))

w16 <- read_excel("WDI Archives/2016.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2015,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2016) %>%
  mutate(value = as.numeric(value))

w17 <- read_excel("WDI Archives/2017.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2016,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2017) %>%
  mutate(value = as.numeric(value))

w18 <- read_excel("WDI Archives/2018.xlsx", 
                  sheet = "Data") %>%
  clean_names() %>%
  rename(series_name = indicator_name,
         series_code = indicator_code) %>%
  pivot_longer(cols = x1960:x2017,
               names_to = "year",
               values_to = "value",
               values_drop_na = FALSE
  ) %>%
  mutate(year = as.integer(substr(year, 2, 5)),
         value = ifelse(value == "..", NA, value),
         version = 2018) %>%
  mutate(value = as.numeric(value))

## Stacking archived WDI files
wdi_stack <- bind_rows(w05,
                       w06,
                       w07,
                       w08,
                       w09,
                       w10,
                       w11,
                       w12,
                       w13,
                       w14,
                       w15,
                       w16,
                       w17,
                       w18)
setDT(wdi_stack)
rm(w05); rm(w06); rm(w07); rm(w08); rm(w09); rm(w10); rm(w11); rm(w12); rm(w13); rm(w14); rm(w15); rm(w16); rm(w17); rm(w18)

series_intersect <- wdi_stack %>%
  filter(year >= 1990) %>%
  group_by(version, series_code) %>%
  summarize(non_na = sum(!is.na(value), na.rm = FALSE)) %>%
  ungroup() %>%
  group_by(series_code) %>%
  tally() %>%
  filter(n == 14)
valid_vars <- series_intersect$series_code

country_intersect <- wdi_stack %>%
  filter(year >= 1990) %>%
  group_by(version, country_code) %>%
  summarize(non_na = sum(!is.na(value), na.rm = FALSE)) %>%
  ungroup() %>%
  group_by(country_code) %>%
  tally() %>%
  filter(n == 14)
valid_countries <- country_intersect$country_code

wdi_stack_valid <- wdi_stack[series_code %in% valid_vars & country_code %in% valid_countries]

## Backfilling across countries
wdi_stack_valid_agg_by_var_year <- wdi_stack_valid[, .(na_rate = mean(is.na(value), na.rm = FALSE)),
                                                   by = c("year", "version")]
wdi_stack_valid_agg_by_var_year <- wdi_stack_valid_agg_by_var_year[, years_till_version := version - year]

## Writing backfilling data
write_csv(wdi_stack_valid_agg_by_var_year, "wdi_stack_valid_agg_by_var_year.csv")

## Separating by populist/nonpopulist
populists <- read_csv("funke_data.csv") %>%
  mutate(ctry = countrycode::countrycode(ctry, "country.name", "wb"))

populists_l1 <- populists %>%
  mutate(year = year + 1) %>%
  rename(right_aer_l1 = right,
         fpopulism_aer_l1 = fpopulism)
setDT(populists_l1)

wdi_stack_valid2 <- merge.data.table(
  x = wdi_stack_valid,
  y = populists_l1,
  by.x = c("country_code", "year"),
  by.y = c("ctry", "year"),
  all.x = TRUE
)
wdi_stack_valid2 <- wdi_stack_valid2[, fpopulism_aer_l1 := ifelse(is.na(fpopulism_aer_l1),
                                                                  0, fpopulism_aer_l1)]

wdi_stack_valid2 <- merge.data.table(
  x = wdi_stack_valid2,
  y = populists_l1,
  by.x = c("country_code", "version"),
  by.y = c("ctry", "year"),
  all.x = TRUE,
  suffixes = c("", "_version")
)

wdi_stack_valid2 <- wdi_stack_valid2[, fpopulism_aer_l1_version := ifelse(is.na(fpopulism_aer_l1_version),
                                                                          0, fpopulism_aer_l1_version)]

wdi_stack_valid2 <- wdi_stack_valid2[version - year >= 2]
wdi_stack_valid2 <- wdi_stack_valid2[year >= 2003]
wdi_stack_valid2 <- wdi_stack_valid2[, missing_initially := ifelse(version-year == 2,
                                                                   ifelse(is.na(value),
                                                                          1,
                                                                          0),
                                                                   NA)]

missing_initially <- wdi_stack_valid2[missing_initially == 1]
missing_initially <- missing_initially[, c("country_code", "year",
                                           "series_code"),
                                       with = FALSE]
missing_initially <- unique(missing_initially)
missing_initially <- missing_initially[, missing_initially := 1]
wdi_stack_valid2 <- wdi_stack_valid2[, missing_initially := NULL]

wdi_stack_valid2 <- merge.data.table(
  x = wdi_stack_valid2,
  y = missing_initially,
  by = c("country_code", "year", "series_code"),
  all.x = TRUE
)

wdi_stack_valid2 <- wdi_stack_valid2[, missing_initially := ifelse(is.na(missing_initially),
                                                                   0,
                                                                   missing_initially)]

### By populism (in measured year + in published year)
wdi_stack_valid2_miss_init <- wdi_stack_valid2[missing_initially == 1]
wdi_stack_valid2_miss_init <- wdi_stack_valid2_miss_init[, years_since := version - year]
wdi_stack_valid2_miss_init <- wdi_stack_valid2_miss_init[, was_pop_now_not := as.integer(fpopulism_aer_l1 == 1 &
                                                                                           fpopulism_aer_l1_version == 0)]
wdi_stack_valid2_miss_init <- wdi_stack_valid2_miss_init[, was_pop_now_still := as.integer(fpopulism_aer_l1 == 1 &
                                                                                             fpopulism_aer_l1_version == 1)]
wdi_stack_valid2_miss_init_agg <- wdi_stack_valid2_miss_init[, .(na_rate = mean(is.na(value), na.rm = FALSE)),
                                                             by = c("fpopulism_aer_l1", "years_since")]

## By populism, scientific variables
wdi_stack_valid2_sci <- wdi_stack_valid2[, sci := series_name %in% wdi_science$variable]
wdi_stack_valid2_sci_miss_init <- wdi_stack_valid2_sci[missing_initially == 1]
wdi_stack_valid2_sci_miss_init <- wdi_stack_valid2_sci_miss_init[, years_since := version - year]
wdi_stack_valid2_sci_miss_init <- wdi_stack_valid2_sci_miss_init[, was_pop_now_not := as.integer(fpopulism_aer_l1 == 1 &
                                                                                                   fpopulism_aer_l1_version == 0)]
wdi_stack_valid2_sci_miss_init <- wdi_stack_valid2_sci_miss_init[, was_pop_now_still := as.integer(fpopulism_aer_l1 == 1 &
                                                                                                     fpopulism_aer_l1_version == 1)]
wdi_stack_valid2_sci_miss_init <- wdi_stack_valid2_sci_miss_init[, .(na_rate = mean(is.na(value), na.rm = FALSE)),
                                                                 by = c("sci", "fpopulism_aer_l1", "years_since")]

## Writing data
write_csv(wdi_stack_valid2_miss_init_agg, "backfilling_by_populism.csv")
write_csv(wdi_stack_valid2_sci_miss_init, "backfilling_by_populism_sci.csv")
